Supercias Companies Analysis#
First we are goint to ge the date from the supercias ranking website
packages = ['seaborn', 'matplotlib', 'numpy', 'altair', 'pandas', 'numpy', 'os', 'pymongo', 'gridfs']
for package in packages:
try:
__import__(package)
except ImportError:
!pip install {package}
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import ScalarFormatter
import altair as alt
import os
# set the default renderer to vega
alt.data_transformers.enable("vegafusion")
alt.renderers.enable('default')
# Set display format for floating-point numbers
pd.options.display.float_format = '{:,.2f}'.format
#check if the following files are in the Downloads folder
domain = 'https://appscvsmovil.supercias.gob.ec/ranking/recursos/'
# the user should change the folder path to the one where the files are located
folder = '/home/' + os.environ['USER'] + '/Downloads/'
ranking_path = folder + 'bi_ranking.csv' if 'bi_ranking.csv' in os.listdir(folder) else domain + 'bi_ranking.csv'
ids_path = folder + 'bi_compania.csv' if 'bi_compania.csv' in os.listdir(folder) else domain + 'bi_compania.csv'
segmentos_path = folder + 'bi_segmento.csv' if 'bi_segmento.csv' in os.listdir(folder) else domain + 'bi_segmento.csv'
ciiu_path = folder + 'bi_ciiu.csv' if 'bi_ciiu.csv' in os.listdir(folder) else domain + 'bi_ciiu.csv'
# read from a csv file into a pd dataframe
df_ranking = pd.read_csv(ranking_path, low_memory=False)
# the companias string
df_ids = pd.read_csv(ids_path, low_memory=False)
# segementos
df_segmentos = pd.read_csv(segmentos_path, low_memory=False)
# Código de Clasificacón Industrial Internacional Unifrome
df_ciiu = pd.read_csv(ciiu_path, low_memory=False)
We also want to read from the the mondo BD and get all fo teh documets from each company#
from pymongo import MongoClient
from gridfs import GridFS
# Connect to MongoDB on local host with ip address
endpoint = 'mongodb://10.0.10.5:27017'
db = MongoClient(endpoint)['supercias_ranking']
#companies = db['companies_suggestions']
# check the that here is a mongodb collection
#print(companies.count_documents({}))
Let’s only get the year 2023 and make the passivos by subtracting the patrimonio from the activos#
let’s merge all of the df into one#
# get the anio with 2023
df_ranking = df_ranking[df_ranking['anio'] == 2023]
# rename the cuii column
df_ciiu = df_ciiu.rename(columns={'descripcion': 'ciiu_desc', 'ciiu': 'ciiu_code'})
df_ciiu['ciiu_code'] = df_ciiu['ciiu_code'].str.strip()
# match all of the expedientes in the df_ids with the expedientes in the df_ranking
df = pd.merge(df_ranking, df_ids, on='expediente', how='left')
# let's merge the ciiu and the segments with the
df = pd.merge(df, df_ciiu, left_on='ciiu_n1', right_on='ciiu_code', how='left')
df.rename(columns={'ciiu_code': 'ciiu_n1_code', 'ciiu_desc': 'ciiu_n1_desc'}, inplace=True)
df = pd.merge(df, df_ciiu, left_on='ciiu_n6', right_on='ciiu_code', how='left')
df.rename(columns={'ciiu_code': 'ciiu_n6_code', 'ciiu_desc': 'ciiu_n6_desc'}, inplace=True)
# get the passivos by subtracting patrimonio from activos
df['pasivos'] = df['activos'] - df['patrimonio']
# check that there are error where activos is not equal to passivos + patrimonio
#df_error = df[df['activos'] != df['pasivos'] + df['patrimonio']]
# get the name and the passivos columns only
#df_error = df_error[['nombre', 'activos', 'pasivos', 'patrimonio']]
#df_error['margin'] = df_error['activos'] - df_error['pasivos'] - df_error['patrimonio']
# df print nu
#df_error.head()
#print(df.columns)
let seperate the companies into categoris by the number of activos#
How we want ot to know that is the ditribution of
# make companies categories between 0 and 2.5 billion in assets
# small companies: 0 - 500 thousand
# medium companies: 500 thousand - 50 million
# large companies: 50 million - up
df_small = df[df['activos'] < 500000]
df_medium = df[(df['activos'] >= 500000) & (df['activos'] < 50000000)]
df_large = df[(df['activos'] >= 50000000)]
# get the number of companies in each category and make a pie chart
pie_df = pd.DataFrame({
'size': ['Small - 0 - 500k', 'Medium - 500k - 50m', 'Large - 50 - up'],
'count': [df_small.shape[0], df_medium.shape[0], df_large.shape[0]]
})
pie_df['percentage'] = (pie_df['count'] / pie_df['count'].sum()) * 100
# Define custom colors for each category using named colors
color_scale = alt.Scale(domain=['Small - 0 - 500k', 'Medium - 500k - 50m', 'Large - 50m- up'],
range=['steelblue', 'orange', 'mediumseagreen' ])
pie_df.head()
pie_chart = alt.Chart(pie_df).mark_arc().encode(
theta=alt.Theta(field="count", type="quantitative"),
color=alt.Color(field="size", type="nominal", scale=color_scale),
tooltip=['size', 'count', alt.Tooltip('percentage:Q', format='.1f', title='percentage')]
).interactive()
pie_chart.display()
def make_pie_chart(df, title):
type_counts = df['tipo'].value_counts().reset_index()
type_counts.columns = ['tipo', 'count']
type_counts['percentage'] = (type_counts['count'] / type_counts['count'].sum()) * 100
return alt.Chart(type_counts).mark_arc().encode(
theta=alt.Theta(field="count", type="quantitative"),
color=alt.Color(field="tipo", type="nominal"),
tooltip=['tipo', 'count', alt.Tooltip('percentage:Q', format='.1f',title='percentage')]
# show percentge in legentd
).interactive().properties(
title=title
)
all_pie = make_pie_chart(df, 'All Companies')
small_pie = make_pie_chart(df_small, 'Small Companies')
medium_pie = make_pie_chart(df_medium, 'Medium Companies')
large_pie = make_pie_chart(df_large, 'Large Companies')
# Combine the charts into two columns
column1 = alt.vconcat(all_pie, small_pie)
column2 = alt.vconcat(medium_pie, large_pie)
# Combine the charts
# make the legend independent
# make two columns
chart = alt.hconcat(
column1, column2
).resolve_legend(
size="independent"
).properties(
title='Distribution of Companies types by Size',
)
chart.show()
def make_chart(df, title='Activos de Companias', max_bins=100, color='steelblue', width=450, height=300):
# Create a selection
click = alt.selection_point(encodings=['x'])
# Create the right bar chart
bars = alt.Chart(df).mark_bar().encode(
x=alt.X('activos:Q', bin=alt.Bin(maxbins=max_bins), title='Activos'),
y=alt.Y( 'count()', title='Numero de companias'),
color=alt.condition(click, alt.ColorValue(color), alt.ColorValue(color)),
tooltip=['count():Q']
).add_params(
click
).properties(
width=width,
height=height,
title=title,
).interactive()
# Create the left chart (scatter plot)
scatter = alt.Chart(df).mark_circle().encode(
x='pasivos:Q',
y='activos:Q',
color='ciiu_n1_desc:N',
tooltip=['nombre:N', 'activos:Q', 'pasivos:Q', 'patrimonio:Q', 'ciiu_n1_desc:N']
).transform_filter(
click
).properties(
width=width,
height=height,
title=''
).interactive()
# Filter the left chart based on selection
filtered_scatter = scatter.transform_filter(
click
)
# Combine the charts
chart = alt.vconcat(filtered_scatter, bars).resolve_legend(
)
chart.show()
make_chart(df_small, title='Activos de Companias pequenas', color='steelblue')